Fun with Word Embeddings
Christopher Δ 05/2025
- load a word2vec embedding model
- apply the model to a relatively short list of common English words
- begin at a specified word and engage in a 'random walk' limited to unvisited, similar words
- store these in an ordered list
- calculate the first four principal components of the visited words, reducing the embedding dimensionality from 300 to 4
- generate an animated plot, where x, y, size, and color represent the respective principal components
In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
import gensim.downloader as api
from gensim.models import KeyedVectors

MODEL_NAME = "word2vec-google-news-300" # 3M words embedded in 300 dimensions
WORD_LIST_PATH = "/usr/share/dict/words" # 75k words
SUBSET_FILEN = "model_subset.wv"

# Read the dictionary word list once, with a context manager so the file
# handle is closed (the original opened it in both branches and never closed it).
with open(WORD_LIST_PATH, "r") as fh:
    word_list = [w.strip().lower() for w in fh]

try:
    # Fast path: reuse a previously saved subset model from disk.
    model = KeyedVectors.load_word2vec_format(SUBSET_FILEN)
except (FileNotFoundError, ValueError):
    # Slow path: no (or unreadable) cached subset — download the full model,
    # keep only dictionary words, and cache the subset for next time.
    # (Was a bare `except:`, which also swallowed KeyboardInterrupt etc.)
    model = api.load(MODEL_NAME)
    in_vocab = [w for w in word_list if w in model] # eliminate words that are not modeled
    model = model.vectors_for_all(in_vocab) # subset model
    model.save_word2vec_format(SUBSET_FILEN)

# Dictionary words present in the (subset) model's vocabulary.
words_in_common = [w for w in word_list if w in model]
In [2]:
def random_talk(token, N=5, limit=128, model=None, words_in_common=None):
    """
    Random walk over the embedding's similarity graph.

    Starting from `token`, repeatedly yield the current word, then hop to one
    of its `N` most-similar (cosine) words chosen at random, skipping words
    already visited.  Stops after `limit` yields.

    Parameters
    ----------
    token : str
        Word to start from (must be in the model's vocabulary).
    N : int
        Number of nearest neighbours to sample from at each step.
    limit : int
        Total number of words to yield.
    model : KeyedVectors-like, optional
        Anything with `most_similar` / `vectors_for_all`; defaults to the
        module-level `model`.
    words_in_common : iterable of str, optional
        Vocabulary universe; defaults to the module-level `words_in_common`.
    """
    # Resolve the module-level defaults at call time (the original bound them
    # at def time), which also makes the generator testable in isolation.
    if model is None:
        model = globals()["model"]
    if words_in_common is None:
        words_in_common = globals()["words_in_common"]
    never_here = set(words_in_common)  # words not yet visited
    been_here = set()                  # words already yielded
    for i in range(limit):
        # For very long walks, periodically shrink the model to the unvisited
        # words so most_similar() searches a smaller vocabulary.
        if i % 1000 == 0 and i > 1:
            model = model.vectors_for_all(never_here)
        yield token
        been_here.add(token)
        never_here.discard(token)  # discard: safe if token wasn't in the word list
        these = model.most_similar(token, topn=N)
        rando = np.random.randint(N)
        last_token = token
        token = these[rando][0]
        j = 1
        n = N
        while token in been_here:  # ensure we haven't visited
            if j < N:
                # Try the remaining neighbours of the original top-N list.
                token = these[(rando + j) % N][0]
                j += 1
            else:
                # All top-N neighbours already visited: widen the search one
                # neighbour at a time.  (Bug fix: the original tested the walk
                # index `i` against N here, so early steps whose N neighbours
                # were all visited looped forever.)
                n += 1
                these = model.most_similar(last_token, topn=n)
                token = these[n - 1][0]
def convert_to_pca_model(w2v_model, n_components=2):
    """Fit a PCA on every embedding vector in `w2v_model` and return the fitted PCA."""
    embedding_matrix = np.array(
        [w2v_model[token] for token in w2v_model.index_to_key]
    )
    reducer = PCA(n_components=n_components)
    reducer.fit(embedding_matrix)
    return reducer
In [3]:
# Walk 512 steps from "pianoforte", then project every word in the subset
# model onto its first four principal components.
words = list(random_talk("pianoforte", 4, 512))
pca = convert_to_pca_model(model, n_components=4)
# Transform the whole vocabulary in ONE batched sklearn call instead of one
# pca.transform() per word (the original made thousands of 1-row calls).
all_coords = pca.transform(np.array([model[key] for key in model.index_to_key]))
pca_dict = dict(zip(model.index_to_key, all_coords)) # 😎
component_list = [[key] + list(pca_dict[key]) for key in words]
df = pd.DataFrame(component_list, columns = ["token", "pc1", "pc2", "pc3", "pc4"])
# Plotly marker sizes must be non-negative integers; shift/stretch pc3 into a
# positive range. NOTE(review): assumes pc3 > -1.67 everywhere — a smaller
# value would make the fractional power NaN. Holds for this model/walk.
df.pc3 = pd.Series([int(x) for x in (df.pc3 + 1.67)**1.7 * 100]) # needs to be a positive integer
df.head(8)
Out[3]:
| token | pc1 | pc2 | pc3 | pc4 | |
|---|---|---|---|---|---|
| 0 | pianoforte | -0.436271 | 0.352430 | 158 | 0.162577 |
| 1 | piano | -0.588039 | 0.458752 | 293 | 0.211561 |
| 2 | cello | -0.765732 | 0.396059 | 247 | 0.198156 |
| 3 | violin | -0.609043 | 0.371390 | 285 | -0.016582 |
| 4 | viola | -0.597332 | 0.362715 | 240 | 0.204763 |
| 5 | bassoon | -0.645873 | 0.532368 | 225 | 0.153122 |
| 6 | oboe | -0.667731 | 0.431452 | 200 | 0.098334 |
| 7 | flute | -0.833134 | 0.494061 | 276 | 0.154512 |
In [4]:
# Pad the x/y axis ranges a little so markers at the extremes aren't clipped;
# fix the ranges up front so they don't jump between animation frames.
ext = 0.15
range_x = [df["pc1"].min() - ext, df["pc1"].max() + ext]
range_y = [df["pc2"].min() - ext, df["pc2"].max() + ext]
range_c = [df["pc4"].min(), df["pc4"].max()]

# One frame per visited word: x/y are pc1/pc2, marker size is pc3,
# marker colour is pc4.
fig = px.scatter(
    df,
    x="pc1",
    y="pc2",
    size="pc3",
    color="pc4",
    text="token",
    size_max=120,
    height=800,
    width=800,
    range_x=range_x,
    range_y=range_y,
    range_color=range_c,
    title="Bouncing around Word Space",
    subtitle="size ~ pc3\n",
    animation_frame=df.index,
)
fig.update_layout(font=dict(size=20))

# Speed up the Play button: shorten both the per-frame hold and the tween.
play_args = fig.layout.updatemenus[0].buttons[0].args[1]
play_args['frame']['duration'] = 450
play_args['transition']['duration'] = 420

fig.show()
fig.write_html("word_space_animation.html") # to be spliced into exported notebook